#delimit ;

** Close any log left open under this name by an earlier interrupted run;
capture log close templog;

** Session settings: no paging of output, plain-text logs, pause enabled;
set logtype text;
set more off;
pause on;

** REPLACE FILE PATH WITH PATH TO RELEVANT REPLICATION FILES;
** All log and data paths used below are built from this root;
local fileloc "~/KMS_REPLICATION";

** Open the named log for this import run, overwriting any previous copy;
log using `fileloc'/log_files/importing_hourly_traffic.txt, replace name(templog);

**************************************************************************;
**************CONSTRUCT DATA SET OF ALL HOURLY TRAFFIC INFO***************;
**************************************************************************; 

clear all;
    
** We obtained PeMS data from the PeMS website - see http://pems.dot.ca.gov. Obtaining sensor-level data requires an account. Registration is currently free and possible on the PeMS website;

**Will insheet all available data from PeMS traffic database and combine into one giant data file;
         
**Note: Some regions do not have all years and months included in this do-file. The capture command is used in the loops below to avoid this problem and simplify code. Individual data files must be checked to find the start and end dates of relevant sensors;
 
**Importing data from txt files;
 
**macro for each available region;
**NOTE: Though region 04 has data from 9/2001, we only use from 11/2001 on. This is because data appear bad for 10/2001, and emails with techs at PeMS indicate data is spotty in that region for the two month period;
 
 **NOTE REGION 10 IS OMITTED DUE TO HAVING NO DATA UNTIL 2006;
 **NOTE REGION 6 IS OMITTED SINCE DATA WERE UNRELIABLE AT TIME OF ANALYSIS (AND only begins in 2005);
 
** Loop over every region, year and month. For each combination: unzip the raw PeMS station-hour text file, read it, keep and rename the needed columns, drop ramp sensors, count lanes, convert the timestamp, save one dta file, and rezip the raw text file;
** Combinations with no raw file on disk are skipped rather than aborting the run;
foreach region in 03 04 07 08 11 12 {;

	** NOTE(review): the header note mentions region 04 data from 11/2001, but this year list starts at 2002 - confirm whether 2001 files are handled elsewhere;
	foreach year in 2002 2003 2004 2005 2006 2007 {;

		** These are MONTHS, not regions;
		foreach month in 01 02 03 04 05 06 07 08 09 10 11 12 {;

			** Build the raw-text and output paths once so the same name is used for every step;
			local txtfile `fileloc'/data/traffic_data/text_data/d`region'_data/d`region'_text_station_hour_`year'_`month'.txt;
			local dtafile `fileloc'/data/traffic_data/dta_files/d`region'_`year'_`month'.dta;

			** Raw data are stored compressed with gzip. The path is left unquoted so the shell can expand a leading tilde;
			shell gunzip `txtfile'.gz;

			** Not every region has every year and month. A missing file gives return code 601 and is skipped. Any other read error still stops the run;
			capture noisily insheet using `txtfile', nonames clear;
			local rc = _rc;
			if `rc' == 601 {;
				display as text "No raw file for region `region', `year'-`month' - skipping";
				continue;
			};
			if `rc' != 0 {;
				exit `rc';
			};

			quietly {;

				** Variables arrive named v1, v2, and so on. Keep only the columns used and give them meaningful names;
				keep v1 v2 v5 v6 v7 v10 v11 v12 v19 v22 v25 v28 v31;

				rename v1  dt_time   ;
				rename v2  station   ;
				rename v5  direction ;
				rename v6  stn_type  ;
				rename v7  length    ;
				rename v10 tot_flow  ;
				rename v11 avg_occ   ;
				rename v12 avg_spd   ;
				rename v19 flow_1    ;
				rename v22 flow_2    ;
				rename v25 flow_3    ;
				rename v28 flow_4    ;
				rename v31 flow_5    ;

				** Station type codes;
				**FF = Fwy-Fwy;
				**ML = Mainline;
				**CD = Coll/Dist;
				**HV = HOV;
				**FR = Off ramp;
				**OR = On ramp;

				** Drop onramp and offramp sensors;
				drop if stn_type=="FR"|stn_type=="OR" ;

				** Count number of lanes for each sensor - this will be used for some averaging in a few steps;
				** lanes ends up as the highest lane number with a non-missing flow, or 1 if lanes 2 to 5 are all missing;
				gen lanes = 1;
				foreach lane in 2 3 4 5 {;
					replace lanes = `lane' if flow_`lane' ~= . ;
				};

				** Format date in non-string form;
				** Assumes dt_time holds text of the form mm/dd/yyyy time, as found in the PeMS traffic data;

				** Optional hour extraction, left disabled because the analysis uses daily totals;
				**gen hour=word(dt_time,2) ;
				**destring hour, ignore(":") replace ;
				**replace hour = hour/10000;

				** Importing dates from traffic data: parse the timestamp into a Stata clock value stored as a double;
				gen double date = clock(dt_time, "MDYhms");
				format date %tc;

				**gen dayofwk = dow(date2);

				keep station length date tot_flow avg_spd avg_occ direction lanes stn_type;
				rename station id;
				sort id ;

				save `dtafile', replace;

			} /* quietly close */ ;

			** Rezip large files;
			shell gzip `txtfile';

		};

	};

};

** Empty memory and close the log opened for this import run;
clear;

log close templog;